* NOTE: if Stata does not do so automatically, cd to the top level directory of the data depository (where this script is stored)
* requires user-written packages cibar, colorpalette, grc1leg2, grstyle, sadi, sq (all from SSC)
* power tests require hours of computing time; for that reason, the forvalues parameter ranges (stratified, bandwidth, run) in lines 118-120 are currently set to single numbers that won't generate true power tests but make sure the script runs in reasonable time; if you actually want to run the power tests, adjust as desired

log using comparison_tests, replace
clear all
global sequencedata "data\sequence.dta"
global maindata "data\maindata.dta"

/****************************************************************************
******************************************************************************
	I. DOCUMENT VIEWS
******************************************************************************
****************************************************************************/

/***********************************************************
	Fig. 1
************************************************************/
use "$sequencedata", clear
sqset doc ID eventtime
grstyle init
grstyle set plain, nogrid
grstyle set color cblind
forvalues c==1/7 { // NB: I tried using a different ordering of sequences (edit distance clustered via single or Wards linkage, as Halpin recommends): it's inintelligble
	preserve
	keep if country==`c'
	egen id = group(ID)
	sqset doc id eventtime
	sqindexplot, scheme(_GRSTYLE_) plotregion(margin(zero)) title("`: label country `c''") xtitle("Normalized Time") xlabel(17 "start" 482 "finish", labsize(*.5) notick) ytitle("Judge") ylabel(minmax) name(p`c', replace) nodraw ///
	legend(cols(1) order(2 "facts" 1 "briefs" 6 "trial judgment" 5 "statute" 4 "precedent") size(*.75)) 
	restore
}
grc1leg2 p1 p2 p3 p4 p5 p6 p7, scheme(s1color) xcommon rows(3) holes(4 7) imargin(tiny) graphregion(margin(r+8)) xtob1title ytol1title ring(0) pos(7) title("Fig. 1: Document View Paths by Country") ///
	note("By country (plot), the graph shows for each judge (vertical axis) the documents live on that judge's screen from start to finish (horizontal axis). Orange indicates instruction views.", size(*.6))
graph play fig1_com_civ_sides
graph save graphs\sequences, replace

/***********************************************************
	Cross-Country Comparisons Using Edit Distance
***********************************************************/

use "$sequencedata", clear
reshape wide doc, i(country ID) j(eventtime)
sort ID

* simple grouping: average distance between countries' judges
preserve
	matrix scost = (J(6,6,1) - I(6))
	oma doc*, subsmat(scost) indel(1) length(500) pwdist(ED) dups
	keep ID country
	tempfile ID
	save `ID'
	clear
	svmat ED
	merge 1:1 _n using `ID', assert(3) nogenerate
	rename (ID country) =1
	reshape long ED, i(ID1 country1) j(ID)
	merge m:1 ID using `ID', assert(3) nogenerate
	rename (ID country) =2
	collapse (mean) meandistance=ED (count) samplesize=ED, by(country1 country2)
	forvalues i=1/6 {
	forvalues j=`=`i'+1'/7 { 
		drop if country2==`j' & country1==`i' // eliminating duplicates B-A of A-B
	}
	}
	sort meandistance
	list country* meandistance samplesize
restore

* test of differences by country and by common vs. civil law
gen commonL = inlist(country,"India":country,"USA":country) + 1 // email from B Halpin 2019/12/2: sddiscrep assumes group variable starts at 1, otherwise will throw an error
foreach subscost in 1 1.5 { // 1 is Levenshtein
di _n "*** Substitution cost `subscost' ***"
matrix scost = `subscost'*(J(6,6,1) - I(6))
forvalues prime=-1/1 {
	preserve
	if `prime'>-1 {
		decode ID, gen(randomID)
		merge 1:1 random using "$maindata", assert(2 3) keep(3) nogenerate keepusing(prime)
		keep if prime==`prime'
		drop prime random
		}
	sort ID
	oma doc*, subsmat(scost) indel(1) length(500) pwdist(ED) dups
	sddiscrep country, dist(ED) id(ID) niter(100000)
	sddiscrep commonL, dist(ED) id(ID) niter(1000) // problem: the test here should be for groups of countries not for groups of individuals (akin to clustering) --> permutation test in next lines
	scalar Rstar = r(pseudoR2)
	cap matrix drop R2
	forvalues i=1/5 { // not running it for i=6 (-->j=7) because that is the actual test statistic
	forvalues j=`=`i'+1'/7 { 
		qui replace commonL = inlist(country,`i',`j') + 1
		qui sddiscrep commonL, dist(ED) id(ID) niter(1)
		matrix R2 = (nullmat(R2) \ r(pseudoR2))
	}
	}
	clear
	svmat R2
	count if R21>=Rstar
	di "country-permutation p-value of common v. civil law for prime=`prime': " =(r(N)+1)/(_N+1) // cf. Lehmann & Romano eq 15.5 and note N, _N do not include the actual com/civ pair
	restore
}
}

*** "Power" tests for common vs. civil law (cf. comments for actual test above)

use "$sequencedata", clear
qui reshape wide doc, i(country ID) j(eventtime)
matrix scost = (J(6,6,1) - I(6))


cap mat drop results
tempfile countries
preserve
keep country
save `countries'
restore

forvalues stratified = 1/1 { // do we draw random data: 0 -- treating all judges as coming from the same population, or 1 -- keeping judges from same country together
forvalues bandwidth=250/250 { // how many steps around 250 to modify (interval from +/- bandwidth around 250))
forvalues run=1/1 { // how many times to run the simulation (i.e., generate random data and check if the test rejects the null)
preserve

	* random data
	if `stratified' {
		bsample, cluster(country) idcluster(cluster) // bootstrap sample of countries
		bsample, strata(cluster) // bootstrap sample of judges within (resampled) countries, i.e., varying countries while keeping judges from same country together
		gen shuffle = runiform() // this and next 4 lines assign random group numbers 1-7 to the resampled "countries", i.e., randomize who is common law
		bys cluster: replace shuffle = shuffle[1] // this gives every judge within cluster (i.e., resampled country) the same random shuffle value
		sort shuffle
		egen newcountry = group(shuffle) // "egen group" creates integer group indicators 1-..., where the numbers are in the order of the argument, i.e., shuffle
		replace country = newcountry
	}
	else {
		bsample // bootstrap sample, treating all judges as coming from the same population
		gen shuffle = runiform()
		sort shuffle
		merge 1:1 _n using `countries', update replace assert(3 4 5) nogenerate // randomly assign country (NB: this keeps sizes of common and civil law countries the same!)
	}

	* common law effect
	gen commonL = inlist(country,"India":country,"USA":country) + 1 // need +1 for sddiscrep to run: arguments must be integer >=1
	forvalues changeddoc = `=max(1,250-`bandwidth')'/`=250+`bandwidth'' { // if bandwidth=250, then the lower bound would evaluate to zero, which is outside the time range
			replace doc`changeddoc' = "precedent":doc if commonL==2
			replace doc`changeddoc' = "statute":doc if commonL==1
		}

	* ind'l distance metric
	replace ID = _n // ID needs to be unique for sddiscrep
	sort ID
	oma doc*, subsmat(scost) indel(1) length(500) pwdist(ED) dups

	* tests of common law explanatory power
	* -- a) permuting individual judges
	sddiscrep commonL, dist(ED) id(ID) niter(1000)
	scalar p_indl_perm = r(p_perm)
	scalar Rstar = r(pseudoR2)
	* -- b) permuting "countries"
	local counter = 0
	forvalues i=1/5 { // not running it for i=6 (-->j=7) because that is the actual test statistic
	forvalues j=`=`i'+1'/7 { 
		qui replace commonL = inlist(country,`i',`j') + 1
		qui sddiscrep commonL, dist(ED) id(ID) niter(1)
		local counter = `counter' + (r(pseudoR2)>=Rstar)
		}
		}

	matrix results = (nullmat(results) \ `stratified', `bandwidth', `run', Rstar, p_indl_perm, `counter', `=(`counter'+1)/22')

restore
}
}
}

mat colnames results = stratified bandwidth run R2 p greaterR2 p_cluster
clear
svmat results, names(col)
gen rejected = p_cluster<.05
assert rejected==(greaterR2==0)
collapse (count) runs=rejected (sum) nrejected=rejected, by(stratified bandwidth)
gen power = nrejected/runs
sort stratified bandwidth
list stratified bandwidth power runs


/***********************************************************
	Summary statistics, simple comparisons, & correlations with reasons
************************************************************/
use "$sequencedata", clear
contract country ID doc, freq(t_)
replace t_=t_/500
decode doc, gen(document)
drop doc
reshape wide t_, i(country ID) j(document) string
egen total = rowtotal(t_*)
assert round(total,.000001)==1
recode t_* (. = 0)
table country, contents(mean t_precedent mean t_statute) format(%5.3f)

count if t_prec==0
di %4.2f (_N-r(N))/_N // fraction of judges consulting precedent
count if t_statute==0
di %4.2f (_N-r(N))/_N // fraction of judges consulting statute

manova t_briefs t_facts t_precedent t_statute t_trial = country
gen commonL = inlist(country,"India":country,"USA":country)
manova t_briefs t_facts t_precedent t_statute t_trial = commonL

decode ID, gen(randomID)
merge 1:1 randomID using "$maindata", assert(3) keepusing(statute prec_mentioned_gen) nogenerate
gen Statute = statute/4 // resolving coder disagreement by averaging
gen Precedent = prec_mentioned_gen/4
sum Precedent Statute // fraction of judges mentioned precedent and statute
pwcorr Statute Precedent t_statute t_prec, sig
spearman Statute Precedent t_statute t_prec, stats(rho p)
	

/****************************************************************************
******************************************************************************
	II. JUDGMENT REASONS
******************************************************************************
****************************************************************************/

use statute policy prec_mentioned_gen judgmentreasons country prime using "$maindata", clear
drop if mi(judgmentreasons)
local elements "Statute Policy Precedent" 
rename (statute policy prec_mentioned_gen) (`elements') // alternative to prec_mentioned_gen: prec_mentioned_spec prec_disti -- graph qualitatively similar
gen words = wordcount(judgmentreasons)
bys country: egen cwords = mean(words)
foreach element in `elements' {
	replace `element' = `element'/4 // this deals with coder disagreement: thus far we had scored each variable by 0-4 coders checking a feature
	gen `element'_sc = `element'/cwords // this scales down a mention by the average length of reasons for that country
	local elements_sc "`elements_sc' `element'_sc"
	}
gen commonL = inlist(country,"India":country,"USA":country)

* importance of scaling
sum words
tabstat words, by(country) format(%6.2f)

/***********************************************************
	Fig. 2
************************************************************/
label list country // to check if next line is correct!
local legend legend(off) // alternatively: "legend(order(- "Civil Law:" 1 "Argentina" 2 "Brazil" 3 "China" 4 "France" 5 "Germany" - "Common Law:" 6 "India" 7 "USA") cols(6) size(*.5))" -- in that case use grc1leg instead of graph combine below
local countrylabels "xlabel(1 "AR" 2 "BR" 3 "CN" 4 "FR" 5 "DE" 6 "IN" 7 "US", labsize(*2))" // consider turning off if turning previous line on
local cibaropts "over(country) barcol(ltblue green red navy gold orange blue) vce(bootstrap) ciopts(lwidth(*4))"
local gphopts "scheme(s1color) ytitle("") xtitle("") nodraw `legend'"
cibar Statute,		`cibaropts' graphopts(`gphopts' ylabel(, labsize(*1.5))	plotregion(margin(b=0))	name(Statute	 , replace) title("Statute",   size(*2.5))) // not sure the plotregion here and imargin below have any effect
cibar Policy,		`cibaropts' graphopts(`gphopts' ylabel(,notick nolabel)	plotregion(margin(b=0))	name(Policy		 , replace) title("Policy",	   size(*2.5)))
cibar Precedent,	`cibaropts' graphopts(`gphopts' ylabel(,notick nolabel)	plotregion(margin(b=0))	name(Precedent   , replace) title("Precedent", size(*2.5)))
cibar Statute_sc,	`cibaropts' graphopts(`gphopts' ylabel(, labsize(*1.5))	`countrylabels'			name(Statute_sc  , replace) title(""))
cibar Policy_sc,	`cibaropts' graphopts(`gphopts' ylabel(,notick nolabel)	`countrylabels'			name(Policy_sc   , replace) title(""))
cibar Precedent_sc,	`cibaropts' graphopts(`gphopts' ylabel(,notick nolabel)	`countrylabels'			name(Precedent_sc, replace) title(""))
graph combine Statute Policy Precedent,			 scheme(s1color) ycommon xcommon altshrink rows(1) imargin(b=0)										 			  name(raw, replace)
graph combine Statute_sc Policy_sc Precedent_sc, scheme(s1color) ycommon xcommon altshrink rows(1) title("{it:---Scaled by Average Length (Words)---}", size(*2)) name(sc, replace)
graph combine raw sc, scheme(s1color) xcommon altshrink rows(2) title("Fig. 2: Prevalence of Reasons by Country") ///
	note("Country means and 95% bootstrap confidence intervals of mentions of particular reasons by individual judges." ///
	"In the upper panel, the mean is taken over indicators whether an individual judge mentioned the feature in the" ///
	"reasons (scored as the number of 0-4 coders who so coded this judge, divided by four). In the lower panel, the" ///
	"country mean is divided by the average number of words in that country's translated reasons.")
graph play fig2_com_civ_sides
graph save graphs\reasons_prevalence_revised2.gph, replace

/***********************************************************
	Comparison: Country and Com/Civ differences
************************************************************/

* size of country vs. com/civ differences
foreach element in `elements' `elements_sc' {
	qui ttest `element', by(commonL)
	scalar comcivdiff = abs(r(mu_2)-r(mu_1))
	qui ttest `element' if commonL, by(country)
	scalar USIndiadiff = abs(r(mu_2)-r(mu_1))
	di USIndiadiff-comcivdiff
}

* tests of country differences
manova `elements' = country // to get the value of Pillai's trace etc.
permute country (e(stat_m)[2,1]), reps(10000) dots(100): manova `elements'	  = country // country test: permute Pillai's trace
manova `elements_sc' = country
permute country (e(stat_m)[2,1]), reps(10000) dots(100): manova `elements_sc' = country

* tests of common/civil differences
program define modHotelT2, rclass // modified Hotelling's T a la Chung & Romano J Econometrics 2016
		forvalues j=0/1 { // j=0/1 will call civil and common law, respectively
			qui mat accum C`j' = `0' if commonL==`j', noconstant deviations means(M`j') // matrices of (N-1)*cov and means, cf. stata.com/support/faqs/data-management/saving-stats/
			mat C`j' = C`j' / ((r(N)-1)*r(N)) // I already apply here the weight 1/n_group for the summation of the two group-specific covariance matrices
		}
		mat T2 = (M1-M0)*invsym(C0+C1)*(M1-M0)' // Chung & Romano equations 10-13 (rearranged by pulling the scale factor m^(-1/2) into the sums)
		return scalar T2 = T2[1,1]
		end
forvalues prime=-1/1 { // -1 will be all observations; 0 and 1 runs the sample separately for those that are primed and those that are not
	preserve // needed to undo next line and permutation of commonL grouping
	if `prime'>-1 keep if prime==`prime'
	local p = 1 // the actual test statistic itself is equal to itself -- and the code below saves time by not calculating that permutation
	local B = 1 // cf. prior line -- also adding to denominator
	modHotelT2 `elements'
	scalar t_star = r(T2)
	forvalues i=1/5 { // not running it for i=6 (-->j=7) because that is the actual test statistic
	forvalues j=`=`i'+1'/7 { 
		qui replace commonL = inlist(country,`i',`j')
		modHotelT2 `elements'
		if r(T2) >= t_star local p = `p' + 1
		local B = `B' + 1
		}
		}
	di "prime " `prime' ", T2 = " %4.2f t_star ", p = " %4.2f `p'/`B'
	restore
	}
	
log close